The objective of this notebook is to refine the clustering annotation done at level 3. This refinement is the result of a manual curation carried out by specialists to remove poor quality cells, misclassified cells or clusters with very few cells.
library(Seurat)
library(Signac)
library(tidyverse)
library(reshape2)
library(ggpubr)
library(harmony)
library(plotly)
set.seed(1234)
cell_type = "NBC_MBC"
# Paths
path_to_obj <- str_c(
here::here("scATAC-seq/results/R_objects/level_3/"),
cell_type,
"/",
cell_type,
"_integrated_level_3.rds",
sep = ""
)
path_to_obj_RNA <- str_c(
here::here("scRNA-seq/3-clustering/5-level_5/"),
cell_type,
"/",
cell_type,
"_seu_obj_level_5_eta_DZnoproli.rds",
sep = ""
)
# Functions
source(here::here("scRNA-seq/bin/utils.R"))
# Colors
color_palette <- c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D",
"#5A5156", "#AA0DFE", "#F8A19F", "#F7E1A0",
"#1C8356", "#FEAF16", "#822E1C", "#C4451C",
"#1CBE4F", "#325A9B", "#F6222E", "#FE00FA",
"#FBE426", "#16FF32", "black", "#3283FE",
"#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3",
"#90AD1C", "#FE00FA", "#85660D", "#3B00FB",
"#822E1C", "coral2", "#1CFFCE", "#1CBE4F",
"#3283FE", "#FBE426", "#F7E1A0", "#325A9B",
"#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD",
"#FEAF16", "#683B79", "#B10DA1", "#1C7F93",
"#F8A19F", "dark orange", "#FEAF16", "#FBE426",
"Brown")
path_to_level_4 <- here::here("scATAC-seq/results/R_objects/level_4/NBC_MBC/")
path_to_save <- str_c(path_to_level_4, "NBC_MBC_integrated_level_4.rds")
# Seurat object
seurat <- readRDS(path_to_obj)
seurat
## An object of class Seurat
## 270784 features across 38684 samples within 1 assay
## Active assay: peaks_macs (270784 features, 255358 variable features)
## 3 dimensional reductions calculated: umap, lsi, harmony
seurat_RNA <- readRDS(path_to_obj_RNA)
seurat_RNA
## An object of class Seurat
## 37378 features across 115478 samples within 1 assay
## Active assay: RNA (37378 features, 0 variable features)
## 3 dimensional reductions calculated: pca, harmony, umap
DimPlot(seurat,
pt.size = 0.1, split.by = "assay")
DimPlot(seurat,
pt.size = 0.1, split.by = "age_group")
p1 <- DimPlot(seurat,
pt.size = 0.1) + NoLegend()
p2 <- DimPlot(seurat_RNA,
pt.size = 0.1,cols = color_palette)
p1 + p2
tonsil_RNA_annotation <- seurat_RNA@meta.data %>%
rownames_to_column(var = "cell_barcode") %>%
dplyr::filter(assay == "multiome") %>%
dplyr::select("cell_barcode", "names_level_5_clusters_eta")
tonsil_ATAC_cell_barcode <- seurat@meta.data %>%
rownames_to_column(var = "cell_barcode") %>%
dplyr::filter(assay == "multiome") %>%
dplyr::select("cell_barcode")
possible_doublets_ATAC <- setdiff(tonsil_ATAC_cell_barcode$cell_barcode,tonsil_RNA_annotation$cell_barcode)
seurat$quality_cells <- ifelse(colnames(seurat) %in% possible_doublets_ATAC, "Poor-quality", "Good-quality")
DimPlot(
seurat,
split.by = "quality_cells")
DimPlot(seurat,split.by = "scrublet_predicted_doublet_atac")
table(seurat$scrublet_predicted_doublet_atac)
##
## FALSE TRUE
## 37181 1503
qc_vars <- c(
"nCount_peaks",
"nFeature_peaks",
"nucleosome_signal",
"TSS.enrichment"
)
qc_gg <- purrr::map(qc_vars, function(x) {
p <- FeaturePlot(seurat, features = x, max.cutoff = "q95")
p
})
qc_gg
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
qc_vars <- c("NBC.MBC", "GCBC", "PC", "CD4.T", "Cytotoxic")
qc_gg <- purrr::map(qc_vars, function(x) {
p <- FeaturePlot(seurat, feature = x,
max.cutoff = 4, min.cutoff = -4) +
scale_color_viridis_c(option = "magma")
p
})
qc_gg
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
resolutions <- c(0.1, 0.12, 0.2, 0.3)
seurat <- FindClusters(seurat, resolution = resolutions)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 38684
## Number of edges: 848623
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9284
## Number of communities: 20
## Elapsed time: 6 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 38684
## Number of edges: 848623
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9198
## Number of communities: 20
## Elapsed time: 6 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 38684
## Number of edges: 848623
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8964
## Number of communities: 23
## Elapsed time: 6 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 38684
## Number of edges: 848623
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8773
## Number of communities: 26
## Elapsed time: 6 seconds
vars <- str_c("peaks_macs_snn_res.", resolutions)
umap_clusters <- purrr::map(vars, function(x) {
p <- DimPlot(seurat, group.by = x, cols = color_palette)
p
})
umap_clusters
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
doublet_clusters <- purrr::map(vars, function(x) {
df1 <- data.frame(table(seurat@meta.data[,x], seurat@meta.data$scrublet_predicted_doublet_atac))
colnames(df1) <- c("Cluster", "Scrublet","Cells")
p <- ggbarplot(df1, "Cluster", "Cells",
fill = "Scrublet", color = "Scrublet",
label = TRUE,
position = position_dodge(0.9))
p
})
doublet_clusters
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
doublet_clusters <- purrr::map(vars, function(x) {
df1 <- data.frame(table(seurat@meta.data[,x], seurat$quality_cells))
colnames(df1) <- c("Cluster", "Quality","Cells")
p <- ggbarplot(df1, "Cluster", "Cells",
fill = "Quality",
label = TRUE,
position = position_dodge(0.9))
p
})
doublet_clusters
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
umap_clusters_level1 <- purrr::map(vars, function(x) {
p <- FeatureScatter(seurat,
"UMAP_1_level_1",
"UMAP_2_level_1", group.by = x, cols = color_palette)
p
})
umap_clusters_level1
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
umap_level_1 <- FeatureScatter(
seurat,
"UMAP_1_level_1",
"UMAP_2_level_1",
group.by = "annotation_level_1"
)
umap_level_1 <- umap_level_1 +
theme(
#legend.position = "none",
plot.title = element_blank()
)
umap_level_1
seurat <- subset(seurat,
quality_cells == "Good-quality")
seurat
## An object of class Seurat
## 270784 features across 38569 samples within 1 assay
## Active assay: peaks_macs (270784 features, 255358 variable features)
## 3 dimensional reductions calculated: umap, lsi, harmony
seurat <- seurat %>%
RunTFIDF() %>%
FindTopFeatures(min.cutoff = "q5") %>%
RunSVD()
#DepthCor(seurat_final)
#seurat <- RunUMAP(object = seurat, reduction = 'lsi', dims = 2:30)
#DimPlot(seurat)
seurat <- RunHarmony(
object = seurat,
dims = 2:40,
group.by.vars = 'gem_id',
reduction = 'lsi',
assay.use = 'peaks_macs',
project.dim = FALSE,
max.iter.harmony = 20
)
seurat <- RunUMAP(seurat, reduction = "harmony", dims = 2:23)
DimPlot(seurat, split.by = "assay")
DimPlot(seurat, split.by = "age_group")
saveRDS(seurat, path_to_save)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Motif_TF/lib/libopenblasp-r0.3.10.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plotly_4.9.4.1 harmony_1.0 Rcpp_1.0.6 ggpubr_0.4.0 reshape2_1.4.4 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4 readr_1.4.0 tidyr_1.1.3 tibble_3.1.2 ggplot2_3.3.5 tidyverse_1.3.0 Signac_1.2.1 SeuratObject_4.0.2 Seurat_4.0.3 BiocStyle_2.16.1
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.1 reticulate_1.20 tidyselect_1.1.1 htmlwidgets_1.5.3 grid_4.0.3 docopt_0.7.1 BiocParallel_1.22.0 Rtsne_0.15 munsell_0.5.0 codetools_0.2-17 ica_1.0-2 future_1.21.0 miniUI_0.1.1.1 withr_2.4.2 colorspace_2.0-2 knitr_1.30 rstudioapi_0.11 stats4_4.0.3 ROCR_1.0-11 ggsignif_0.6.0 tensor_1.5 listenv_0.8.0 labeling_0.4.2 slam_0.1-47 GenomeInfoDbData_1.2.3 polyclip_1.10-0 farver_2.1.0 rprojroot_2.0.2 parallelly_1.26.1 vctrs_0.3.8 generics_0.1.0 xfun_0.18 lsa_0.73.2 ggseqlogo_0.1 R6_2.5.0 GenomeInfoDb_1.24.2 bitops_1.0-7 spatstat.utils_2.2-0 assertthat_0.2.1 promises_1.2.0.1 scales_1.1.1 gtable_0.3.0 globals_0.14.0 goftest_1.2-2 rlang_0.4.11 RcppRoll_0.3.0 splines_4.0.3 rstatix_0.6.0 lazyeval_0.2.2 spatstat.geom_2.2-0 broom_0.7.2
## [52] BiocManager_1.30.10 yaml_2.2.1 abind_1.4-5 modelr_0.1.8 backports_1.1.10 httpuv_1.6.1 tools_4.0.3 bookdown_0.21 ellipsis_0.3.2 spatstat.core_2.2-0 RColorBrewer_1.1-2 BiocGenerics_0.34.0 ggridges_0.5.3 plyr_1.8.6 zlibbioc_1.34.0 RCurl_1.98-1.2 rpart_4.1-15 deldir_0.2-10 pbapply_1.4-3 cowplot_1.1.1 S4Vectors_0.26.0 zoo_1.8-9 haven_2.3.1 ggrepel_0.9.1 cluster_2.1.0 fs_1.5.0 here_1.0.1 magrittr_2.0.1 RSpectra_0.16-0 data.table_1.14.0 scattermore_0.7 openxlsx_4.2.4 lmtest_0.9-38 reprex_0.3.0 RANN_2.6.1 SnowballC_0.7.0 fitdistrplus_1.1-5 matrixStats_0.59.0 hms_0.5.3 patchwork_1.1.1 mime_0.11 evaluate_0.14 xtable_1.8-4 rio_0.5.16 sparsesvd_0.2 readxl_1.3.1 IRanges_2.22.1 gridExtra_2.3 compiler_4.0.3 KernSmooth_2.23-17 crayon_1.4.1
## [103] htmltools_0.5.1.1 mgcv_1.8-33 later_1.2.0 lubridate_1.7.9 DBI_1.1.0 tweenr_1.0.1 dbplyr_1.4.4 MASS_7.3-53 Matrix_1.3-4 car_3.0-10 cli_3.0.0 parallel_4.0.3 igraph_1.2.6 GenomicRanges_1.40.0 pkgconfig_2.0.3 foreign_0.8-80 spatstat.sparse_2.0-0 xml2_1.3.2 XVector_0.28.0 rvest_0.3.6 digest_0.6.27 sctransform_0.3.2 RcppAnnoy_0.0.18 spatstat.data_2.1-0 Biostrings_2.56.0 rmarkdown_2.5 cellranger_1.1.0 leiden_0.3.8 fastmatch_1.1-0 uwot_0.1.10 curl_4.3.2 shiny_1.6.0 Rsamtools_2.4.0 lifecycle_1.0.0 nlme_3.1-150 jsonlite_1.7.2 carData_3.0-4 viridisLite_0.4.0 fansi_0.5.0 pillar_1.6.1 lattice_0.20-41 fastmap_1.1.0 httr_1.4.2 survival_3.2-7 glue_1.4.2 qlcMatrix_0.9.7 zip_2.1.1 png_0.1-7 ggforce_0.3.2 stringi_1.6.2 blob_1.2.1
## [154] irlba_2.3.3 future.apply_1.7.0